# function for fixing legend
# Replace the raw effect_* identifiers with human-readable legend labels.
# `invar` is the (unquoted) column holding effect_20/effect_50/effect_80;
# the recoded labels are written to the (unquoted) column `outvar`.
make_nice_effect_quantiles <- function(df, invar, outvar) {
  nice_labels <- c(
    effect_20 = "Effect at 20% quantile",
    effect_50 = "Effect at 50% quantile",
    effect_80 = "Effect at 80% quantile"
  )
  df %>%
    mutate({{ outvar }} := recode({{ invar }}, !!!nice_labels))
}

# Shared colour scale for the quantile-effect plots: one base hue ("#007FA8"),
# progressively lightened for the lower quantiles. Wrapped in a list so it can
# be added to any ggplot with `+`. Keys must match the labels produced by
# make_nice_effect_quantiles().
effect_scale <- list(
  scale_color_manual(values = c(
    "Effect at 20% quantile" = colorspace::lighten("#007FA8", .7),
    "Effect at 50% quantile" = colorspace::lighten("#007FA8", .4),
    "Effect at 80% quantile" = "#007FA8"
  ))
)
# Load the two fitted brms models from disk. For hm_simple, `file` +
# `file_refit = "never"` makes brm() read the cached fit and never re-sample.
hm <- read_rds("final_models/17-brm-large-sample.rds.bz2")

hm_simple <- brm(file = "final_models/hm_final_rerun.rds", file_refit = "never")

In this last round, the sampler issued warnings about hitting the maximum treedepth:

Warning: 4000 of 4000 (100.0%) transitions hit the maximum treedepth limit of 10.

This was to be expected, given the model took about twice as long to run as in previous iterations. The only aspects that changed were: revision of weights and resampling of data. Given that we had a slightly smaller sample size, it could be that some peculiarities of the specific sample led to inefficient sampling. Inference is likely to be unaffected.

Posterior predictive check

# Posterior predictive check for the large model; x-axis truncated at 8000
# so the bulk of the APC distribution remains visible.
pp_check(hm, ndraws = 10, cores = mc_cores) +
  coord_cartesian(xlim = c(0, 8000))

# Plot posterior predictions against observed APCs for one country, faceted
# by field.
#
# df:                data with P_top10 (centred log scale), country, field,
#                    and APC_in_dollar.
# model:             fitted brms model to predict from.
# country_selection: single country name to filter on.
# var_for_offset:    scaled variable whose "scaled:center" attribute undoes
#                    the centring (defaults to the global `base` data).
# alpha:             transparency of the jittered observed points.
# ndraws:            number of posterior draws used for prediction.
pred_vis <- function(df, model, country_selection,
                     var_for_offset = base$P_top10, alpha = 1, ndraws = 1000) {
  # Map P_top10 back to its original scale: exp(center + centred log value).
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  get_back <- function(df) mutate(df, P_top10 = exp(scale_offset + P_top10))
  
  df %>%
    filter(country == country_selection) %>%
    modelr::data_grid(P_top10, country, field) %>%
    # re_formula = NULL keeps all group-level (country/field) effects.
    add_predicted_draws(model, ndraws = ndraws, re_formula = NULL, 
                        cores = mc_cores) %>%
    get_back() %>% 
    ggplot(aes(P_top10, .prediction)) +
    stat_interval() +
    scale_color_manual(values = colorspace::lighten(clrs[4], c(.8, .67, .42))) +
    scale_y_continuous(labels = dollar) +
    # Observed APCs, jittered, drawn on top of the predicted intervals.
    geom_jitter(aes(y = APC_in_dollar), alpha = alpha, 
                position = position_jitter(width = 5, height = 50),
                data = filter(df, country == country_selection) %>% get_back()) +
    facet_wrap(vars(field)) +
    labs(y = "Predicted vs. actual APC", x = expression(P["top 10%"]),
         color = "Credible interval") +
    # theme_minimal(base_family = "Hind") +
    theme_clean() +
    theme(legend.position = "bottom", panel.grid.minor = element_blank())
}
# Per-country prediction checks for the large model.
pred_vis(base, hm, "Austria", alpha = .5)

pred_vis(base, hm, "Brazil", alpha = .2)

This updated model fares much better for Brazil. The predictions are still not ideal (underestimating higher APCs of ~2000), but overall much better than the previous model.

# Prediction checks for further countries.
pred_vis(base, hm, "China", alpha = .15)

pred_vis(base, hm, "United States", alpha = .2)

pred_vis(base, hm, "Turkey", alpha = .7)

PP check for simple model

# Same checks for the simpler comparison model.
pp_check(hm_simple, ndraws = 10, cores = mc_cores) +
  coord_cartesian(xlim = c(0, 8000))

pred_vis(base, hm_simple, "Brazil", alpha = .2)

Model variances and covariances

# Full parameter summary of the two-component hurdle-lognormal mixture
# (output printed below).
summary(hm) 
##  Family: mixture(hurdle_lognormal, hurdle_lognormal) 
##   Links: mu1 = identity; sigma1 = identity; hu1 = logit; mu2 = identity; sigma2 = identity; hu2 = logit; theta1 = identity; theta2 = identity 
## Formula: APC_in_dollar | weights(total_weight) ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country) 
##          hu1 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
##          hu2 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
##          theta1 ~ 1 + (1 | field)
##    Data: base (Number of observations: 179710) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~country (Number of levels: 69) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept)                  0.46      0.05     0.38     0.56 1.01
## sd(mu1_P_top10)                    0.15      0.02     0.11     0.20 1.00
## sd(mu2_Intercept)                  0.03      0.00     0.02     0.04 1.00
## sd(mu2_P_top10)                    0.01      0.00     0.01     0.02 1.00
## sd(hu1_Intercept)                  1.39      0.20     1.05     1.82 1.00
## sd(hu1_P_top10)                    0.48      0.14     0.22     0.79 1.01
## sd(hu2_Intercept)                  1.86      0.20     1.51     2.28 1.00
## sd(hu2_P_top10)                    0.65      0.10     0.48     0.87 1.00
## cor(mu1_Intercept,mu1_P_top10)    -0.18      0.16    -0.49     0.15 1.00
## cor(mu2_Intercept,mu2_P_top10)    -0.36      0.19    -0.66     0.06 1.00
## cor(hu1_Intercept,hu1_P_top10)    -0.35      0.21    -0.70     0.10 1.00
## cor(hu2_Intercept,hu2_P_top10)     0.20      0.17    -0.13     0.51 1.00
##                                Bulk_ESS Tail_ESS
## sd(mu1_Intercept)                   821     1742
## sd(mu1_P_top10)                    1445     2500
## sd(mu2_Intercept)                  1273     2058
## sd(mu2_P_top10)                    1363     2001
## sd(hu1_Intercept)                  1297     2058
## sd(hu1_P_top10)                     765      948
## sd(hu2_Intercept)                   941     2053
## sd(hu2_P_top10)                    1255     2314
## cor(mu1_Intercept,mu1_P_top10)     1343     1993
## cor(mu2_Intercept,mu2_P_top10)     1990     2500
## cor(hu1_Intercept,hu1_P_top10)     1878     2605
## cor(hu2_Intercept,hu2_P_top10)     1402     1818
## 
## ~field (Number of levels: 19) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept)                  0.30      0.06     0.21     0.42 1.00
## sd(mu1_P_top10)                    0.10      0.03     0.06     0.16 1.00
## sd(mu2_Intercept)                  0.18      0.03     0.13     0.26 1.00
## sd(mu2_P_top10)                    0.02      0.01     0.01     0.03 1.00
## sd(hu1_Intercept)                  1.69      0.25     1.26     2.27 1.00
## sd(hu1_P_top10)                    0.24      0.05     0.15     0.36 1.00
## sd(hu2_Intercept)                  1.88      0.32     1.37     2.59 1.00
## sd(hu2_P_top10)                    0.37      0.08     0.25     0.55 1.00
## sd(theta1_Intercept)               1.12      0.25     0.70     1.67 1.00
## cor(mu1_Intercept,mu1_P_top10)     0.34      0.20    -0.10     0.68 1.00
## cor(mu2_Intercept,mu2_P_top10)    -0.25      0.26    -0.70     0.30 1.00
## cor(hu1_Intercept,hu1_P_top10)    -0.38      0.20    -0.72     0.04 1.00
## cor(hu2_Intercept,hu2_P_top10)     0.21      0.22    -0.23     0.60 1.00
##                                Bulk_ESS Tail_ESS
## sd(mu1_Intercept)                  1359     2258
## sd(mu1_P_top10)                    1411     2328
## sd(mu2_Intercept)                  1235     2105
## sd(mu2_P_top10)                    1324     2516
## sd(hu1_Intercept)                  1407     2147
## sd(hu1_P_top10)                    1879     2609
## sd(hu2_Intercept)                  1175     2101
## sd(hu2_P_top10)                    1569     2636
## sd(theta1_Intercept)                850     1212
## cor(mu1_Intercept,mu1_P_top10)     2013     2483
## cor(mu2_Intercept,mu2_P_top10)     2904     2786
## cor(hu1_Intercept,hu1_P_top10)     2703     2679
## cor(hu2_Intercept,hu2_P_top10)     2358     2879
## 
## Population-Level Effects: 
##                  Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## mu1_Intercept        6.85      0.09     6.66     7.02 1.01      403      928
## hu1_Intercept       -0.90      0.36    -1.64    -0.25 1.00      640     1286
## mu2_Intercept        7.57      0.04     7.49     7.66 1.00      669     1237
## hu2_Intercept       -0.14      0.42    -0.89     0.75 1.00      727     1291
## theta1_Intercept    -0.18      0.27    -0.67     0.36 1.01      554      945
## mu1_P_top10          0.13      0.04     0.05     0.21 1.00     1364     2004
## hu1_P_top10          0.09      0.12    -0.14     0.35 1.00     1555     2421
## mu2_P_top10          0.00      0.01    -0.01     0.02 1.00     1570     2168
## hu2_P_top10         -0.13      0.15    -0.41     0.17 1.01      979     2054
## 
## Family Specific Parameters: 
##        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma1     0.80      0.00     0.79     0.80 1.00     9144     2683
## sigma2     0.20      0.00     0.20     0.20 1.00     8676     3341
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

“Marginal effects”

Here we compute marginal effects manually, by making predictions for a given x (P_top10) and for a slightly increased x on the original scale, and then comparing the predictions. (In the field example directly below, the step is x * 1.1, i.e. +10%; in the averaged effects further down it is x * 1.01, i.e. +1%.)

Fields

# Manual marginal effect per field, evaluated at P_top10 = 1500 for Brazil.
# NOTE(review): the step here is 10% (x * 1.1), not the 1% used in
# average_draws() below — confirm which step size is intended.
scale_offset <- attributes(base$P_top10)[["scaled:center"]]
x1_identity <- 1500
x2_identity <- x1_identity * 1.1
# Transform both evaluation points onto the model's (centred log) scale.
x1_log <- log(x1_identity) - scale_offset
x2_log <- log(x2_identity) - scale_offset


contrast <- predictions(
  hm,
  newdata = datagrid(P_top10 = c(x1_log, x2_log),
                     country = "Brazil", field = unique(base$field))) %>% 
  posteriordraws()
  
# Reshape so each draw holds a "base" and a "step" prediction, then compute
# the relative change between them.
contrast_recomputed <- contrast %>% 
  mutate(x = factor(P_top10, labels = c("base", "step"))) %>% 
  pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid), 
              names_from = x, values_from = draw) %>% 
  mutate(contrast = step / base - 1)
contrast_recomputed %>% 
  ggplot(aes(contrast, fct_reorder(field, contrast))) +
  stat_halfeye() +
  scale_x_continuous(labels = percent)

This is very sensitive to the respective country. Maybe we can recompute an average marginal effect after all?

# Posterior draws of the marginal effect of a 1% increase of P_top10 (on its
# original scale), evaluated at quantile `q` of `orig_var`, over the full
# country x field grid.
#
# model:          fitted brms model.
# orig_var:       P_top10 on the original (identity) scale.
# q:              quantile at which the effect is evaluated.
# var_for_offset: scaled variable providing the centring offset (defaults to
#                 the global `base` data).
# Returns one row per draw x country x field with columns `base`, `step`,
# and the relative change `contrast` = step / base - 1.
average_draws <- function(model, orig_var, q = .5, 
                                var_for_offset = base$P_top10) {
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  
  x1_identity <- quantile(orig_var, q)
  x2_identity <- x1_identity * 1.01
  # Map both evaluation points onto the model's (centred log) scale.
  x1_log <- log(x1_identity) - scale_offset
  x2_log <- log(x2_identity) - scale_offset
  
  
  contrast_all <- predictions(
    model,
    newdata = datagrid(P_top10 = c(x1_log, x2_log),
                       country = unique(base$country), 
                       field = unique(base$field))) %>% 
    posteriordraws()
    
  # One "base"/"step" pair per draw; contrast is their relative difference.
  contrast_all %>% 
    mutate(x = factor(P_top10, labels = c("base", "step"))) %>% 
    pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid), 
                names_from = x, values_from = draw) %>% 
    mutate(contrast = step / base - 1)
}

# Average the per-draw contrasts within each level of `var` (an unquoted
# column, e.g. field or country), keeping one row per group x posterior draw.
# `.groups = "drop_last"` reproduces summarise()'s default result (grouped by
# `var`) while silencing the repeated "has grouped output" message that
# litters the rendered document.
summarise_by <- function(contrast_df, var = field) {
  contrast_df %>% 
    group_by({{ var }}, drawid) %>% 
    summarise(effect = mean(contrast), .groups = "drop_last")
}
# Halfeye plot of per-field effects. `location` is interpolated into the
# x-axis label (e.g. "the 20% quantile"); it does not affect the data shown.
plot_effect <- function(contrast_df, location = "the median") {
  contrast_df %>% 
    ggplot(aes(effect, fct_reorder(field, effect))) +
    stat_halfeye(.width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      y = NULL, 
      x = glue::glue("% change of APC for 1% change of P_top10% at {location}"),
      caption = "Averaged predictions over all countries.") +
    theme_clean() +
    # Fixed x-range so the three quantile plots are directly comparable.
    coord_cartesian(xlim = c(-0.005, 0.005))
}
# Effects at the 20/50/80% quantiles of P_top10.
# NOTE(review): these use `df$P_top10` while the model was fit on `base` —
# confirm `df` and `base` are the same data.
contrast_20 <- average_draws(hm, df$P_top10, q = .2)
contrast_50 <- average_draws(hm, df$P_top10, q = .5)
contrast_80 <- average_draws(hm, df$P_top10, q = .8)
contrast_20_field <- summarise_by(contrast_20, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_20_field %>% 
  plot_effect("the 20% quantile")

This seems reasonable, but we would need to validate it further.

At 50%

contrast_50_field <- summarise_by(contrast_50, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Default location label ("the median") is correct for the 50% quantile.
contrast_50_field %>% 
  plot_effect()

contrast_80_field <- summarise_by(contrast_80, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Fix: pass the correct location so the x-axis label no longer claims
# "the median" for the 80% quantile.
contrast_80_field %>% 
  plot_effect("the 80% quantile")

Compare all three

# Stack the three per-field effect summaries, relabel the quantiles, and plot
# them side by side as dodged point intervals per field.
all_joined <- bind_rows(
  rename(contrast_50_field, effect_50 = effect),
  rename(contrast_80_field, effect_80 = effect),
  rename(contrast_20_field, effect_20 = effect)
)
p <- all_joined %>% 
  # pivot_longer creates `name` (effect_20/50/80) and `effect` columns.
  pivot_longer(contains("effect"), values_to = "effect") %>% 
  drop_na() %>% 
  # Overwrites `name` in place with the human-readable legend labels.
  make_nice_effect_quantiles(name, name) %>% 
  ggplot(aes(effect, fct_reorder(field, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9)) +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL, 
    x = expression(paste("% change of APC for 1% change of ", P["top 10%"], 
                         " at given quantiles")),
    caption = "Predictions averaged over all countries.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.005, 0.005)) +
  guides(colour = guide_legend(reverse = FALSE))
p + 
  effect_scale +
  theme(legend.position = "top", legend.justification = c(1, 0)) + 
  labs(colour = NULL) 

Questions that arise: (1) Why do some fields show a stronger/weaker effect at larger/smaller P_top10 — is this also associated with the hurdle component? (2) In particular, why are physics and mathematics negative? Because of the hurdle component? -> Yes

Need to put into context of overall averages: give average per field (from model or full set)

Intercept at median

# Estimated APC level per field at the median of P_top10 (the "base"
# prediction), averaged over countries within each posterior draw.
contrast_50 %>% 
  group_by(field, drawid) %>% 
  summarise(intercept = mean(base)) %>% 
  ggplot(aes(intercept, fct_reorder(field, intercept))) +
  stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
               point_interval = "median_hdi") +
  scale_x_continuous(labels = scales::dollar) +
  theme_clean() +
  labs(y = NULL, x = expression(paste("Estimated APC at median of ",
                                      P["top 10%"])))
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.

Countries

# Same contrasts, now summarised per country (i.e., averaging over fields).
contrast_20_country <- summarise_by(contrast_20, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_50_country <- summarise_by(contrast_50, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_80_country <- summarise_by(contrast_80, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Combine the per-country effect summaries at the three quantiles into long
# format. (Trailing comma after the last bind_rows() argument removed — it
# only worked because rlang tolerates it, and it reads like a typo.)
all_countries <- bind_rows(
  rename(contrast_20_country, effect_20 = effect),
  rename(contrast_50_country, effect_50 = effect),
  rename(contrast_80_country, effect_80 = effect)
) %>% 
  pivot_longer(contains("effect"), values_to = "effect") %>% 
  drop_na()
# Dodged point intervals per country, coloured by quantile.
p_country <- all_countries %>% 
  ggplot(aes(effect, fct_reorder(country, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9), point_interval = "median_hdi") +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL, 
    # Plain string: glue() was a no-op here (nothing to interpolate).
    x = "% Change of APC for 1% change of P_top10% at given quantiles",
    # Fixed caption: per-country effects average over fields, not countries.
    caption = "Predictions averaged over all fields.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.001, 0.005)) +
  guides(colour = guide_legend(reverse = TRUE))
p_country + scale_color_manual(values = c(
    effect_20 = colorspace::lighten("#007FA8", .7),
    effect_50 = colorspace::lighten("#007FA8", .4),
    effect_80 = "#007FA8"
  ))

Group by continent

# Lookup table of region and ISO country code per country, used for the
# region facets and the GDP join below.
country_identifier <- base %>% 
  distinct(country, region, country_code) %>% 
  drop_na()

all_countries_w_region <- all_countries %>% 
  left_join(country_identifier)
## Joining, by = "country"
# Dot-interval plot of per-country effects at the three quantiles, faceted by
# world region (free y-scales so each facet shows only its own countries).
plot_countries_by_region <- function(df) {
  df %>% 
    # Writes readable quantile labels to `color_label`, leaving `name` intact.
    make_nice_effect_quantiles(name, color_label) %>% 
    ggplot(aes(effect, fct_reorder(country, effect), colour = color_label)) +
    geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
    stat_pointinterval(position = position_dodge(width = .5),
                       .width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      y = NULL, 
      x = expression(paste("% Change of APC for 1% change of ", P["top 10%"], 
                         " at given quantiles")),
      caption = "Predictions averaged over all fields.",
      colour = NULL) +
    theme_clean() +
    # str_wrap keeps long region names from inflating the facet strips.
    facet_grid(rows = vars(str_wrap(region, 9)), space = "free_y", scales = "free_y") +
    # coord_cartesian(xlim = c(-0.001, 0.005)) +
    guides(colour = guide_legend(reverse = FALSE, override.aes = list(size = 2))) +
    theme(legend.position = "top") +
    effect_scale
}

# Europe and Sub-Saharan Africa in one figure, all remaining regions in a
# second, to keep the facet heights readable.
p_europe_subsahara <- all_countries_w_region %>% 
  filter(str_detect(region, "Europe|Sahara")) %>% 
  plot_countries_by_region()

p_rest <- all_countries_w_region %>% 
  filter(!str_detect(region, "Europe|Sahara")) %>% 
  plot_countries_by_region()
p_europe_subsahara

p_rest

Relate to gdp

# World Bank indicators for 2019 (includes GDP per capita, NY.GDP.PCAP.KD).
gdp <- WDI::WDI(start = 2019, end = 2019)

# Join by ISO2 code; both sides carry a `country` column, which yields the
# country.x/country.y pair seen in the output below.
country_effect_with_gdp <- all_countries_w_region %>% 
  left_join(gdp, by = c("country_code" = "iso2c"))
country_effect_with_gdp
## # A tibble: 828,000 × 10
##    country.x drawid name       effect region count…¹ count…² iso3c  year NY.GD…³
##    <chr>     <fct>  <chr>       <dbl> <chr>  <chr>   <chr>   <chr> <int>   <dbl>
##  1 Algeria   1      effect_… -1.09e-3 Middl… DZ      Algeria DZA    2019   4115.
##  2 Algeria   2      effect_…  4.52e-3 Middl… DZ      Algeria DZA    2019   4115.
##  3 Algeria   3      effect_…  1.11e-3 Middl… DZ      Algeria DZA    2019   4115.
##  4 Algeria   4      effect_…  3.03e-3 Middl… DZ      Algeria DZA    2019   4115.
##  5 Algeria   5      effect_…  1.16e-3 Middl… DZ      Algeria DZA    2019   4115.
##  6 Algeria   6      effect_…  3.13e-3 Middl… DZ      Algeria DZA    2019   4115.
##  7 Algeria   7      effect_…  9.06e-4 Middl… DZ      Algeria DZA    2019   4115.
##  8 Algeria   8      effect_…  7.61e-6 Middl… DZ      Algeria DZA    2019   4115.
##  9 Algeria   9      effect_…  4.60e-3 Middl… DZ      Algeria DZA    2019   4115.
## 10 Algeria   10     effect_… -1.17e-3 Middl… DZ      Algeria DZA    2019   4115.
## # … with 827,990 more rows, and abbreviated variable names ¹​country_code,
## #   ²​country.y, ³​NY.GDP.PCAP.KD
# Median effect (with 50% HDI) per country at the median quantile, plotted
# against GDP per capita and coloured by region.
p <- country_effect_with_gdp %>% 
  filter(name == "effect_50") %>% 
  rename(gdp = NY.GDP.PCAP.KD) %>% 
  group_by(country_code, country.x, gdp, region) %>% 
  point_interval(effect, .width = .5, .point = median, .interval = hdi) %>% 
  ggplot(aes(gdp, effect, colour = region, label = country.x)) +
  geom_pointrange(aes(ymin = .lower, ymax = .upper)) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::comma) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  labs(colour = NULL, x = "GDP per capita", 
       y = "% increase in APC for 1% increase of P_top10 at the median") +
  theme_clean() +
  theme(legend.position = "top")  
  #coord_cartesian(ylim = c(0, .003))
p
## Warning: Removed 1 rows containing missing values (geom_pointrange).

# `label` in the aes above provides hover text for the interactive version.
plotly::ggplotly(p)

This could still be improved by adding number of universities (or papers) as a size variable.

# Number of distinct universities per country, used below as an alpha proxy.
# NOTE(review): uses `df` rather than `base` — confirm they are the same data.
unis_p_country <- df %>% 
  distinct(country, University) %>% 
  count(country, name = "n_universities") 
pdata <- country_effect_with_gdp %>% 
  filter(name == "effect_50") %>% 
  rename(gdp = NY.GDP.PCAP.KD) %>% 
  group_by(country_code, country.x, gdp, region) %>% 
  # 50% HDI around the median effect, one row per country.
  point_interval(effect, .width = .5, .point = median, .interval = hdi) %>% 
  left_join(unis_p_country, by = c("country.x" = "country"))

# Label only a hand-picked set of countries; all others get an empty label.
# NOTE: the name `labels` shadows base::labels within this script.
labels <- pdata %>% 
  mutate(label = case_when(
    country.x %in% c("China", "India", "Iran", "Germany", "United States",
                      "Brazil", "Luxembourg", "Czech Republic") ~ country.x,
    TRUE ~ ""))

# GDP plot with alpha encoding the number of universities (log10 scale) and
# selected countries labelled via ggrepel (using the `labels` data above).
pdata %>% 
  ggplot(aes(gdp, effect, colour = region, label = "")) +
  geom_linerange(aes(ymin = .lower, ymax = .upper, alpha = n_universities)) +
  geom_point(aes(alpha = n_universities), size = 2) +
  ggrepel::geom_text_repel(data = labels, aes(label = label),
                            show.legend = FALSE, max.overlaps = Inf,
                           box.padding = 1, min.segment.length = 0) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::dollar) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  # scale_size_continuous(trans = "sqrt") +
  scale_alpha_continuous(range = c(.2, 1), trans = "log10") +
  labs(colour = "Region", x = "GDP per capita", alpha = "Number of universities",
       y = expression(
         paste("% increase in APC for 1% increase of ", P["top 10%"], 
               " at the median")
         )) +
  theme_clean() +
  theme(legend.position = "top", legend.box = "vertical")  
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

  # coord_cartesian(ylim = c(0, .003))
# Estimated APC level per country at the median of P_top10, averaged over
# fields within each posterior draw (counterpart to the per-field version).
contrast_50 %>% 
  group_by(country, drawid) %>% 
  summarise(intercept = mean(base)) %>% 
  ggplot(aes(intercept, fct_reorder(country, intercept))) +
  stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
               point_interval = "median_hdi") +
  scale_x_continuous(labels = scales::dollar) +
  theme_clean() +
  labs(y = NULL, x = expression(paste("Estimated APC at median of ",
                                      P["top 10%"])))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.

Not sure whether this is helpful. What we show is quite abstract: the APC at a hypothetical level of P_top10, averaged across all fields (weighted equally).

The same would apply to the field intercepts above.